
clear 
cd 

* HUDUser ZIP - CBSA
{
import excel "ZIP_CBSA_122010.xlsx", firstrow clear
unique ZIP
gsort ZIP -TOT_RATIO // keep the major one
duplicates drop ZIP, force
keep ZIP CBSA
tempfile temp
save "`temp'"

forvalues i = 2011/2016 {
	import excel "ZIP_CBSA_12`i'.xlsx", firstrow clear
	unique ZIP
	gsort ZIP -TOT_RATIO // keep the major one
	duplicates drop ZIP, force
	keep ZIP CBSA
	rename CBSA CBSA_`y'
	merge 1:1 ZIP using "`temp'"
	replace CBSA = CBSA_`y' if CBSA==""
	drop _merge CBSA_`y'
	save "`temp'", replace
}

}


* ZIP - State
{
import excel "ZIP_COUNTY_122010.xlsx", firstrow clear
unique ZIP
gsort ZIP -TOT_RATIO // keep the major one
duplicates drop ZIP, force
gen state = substr(COUNTY,1,2)
keep ZIP state
tempfile temp2
save "`temp2'"

forvalues i = 2011/2016 {
	import excel "ZIP_COUNTY_12`i'.xlsx", firstrow clear
	unique ZIP
	gsort ZIP -TOT_RATIO // keep the major one
	duplicates drop ZIP, force
	gen state = substr(COUNTY,1,2)
	keep ZIP state
	rename state state_`y'
	merge 1:1 ZIP using "`temp2'"
	replace state = state_`y' if state==""
	drop _merge state_`y'
	save "`temp2'", replace
}
merge 1:1 ZIP using "`temp'", nogen
}


* ZIP - CBA
merge 1:1 ZIP using "ZIP_CBA"
drop _merge
sort CBSA ZIP
replace CBSA = "" if CBSA=="99999"


* ZIP list from claim data
rename ZIP zip
merge 1:1 zip using "ZIP_list.dta"
rename zip ZIP


*** Minor Adjustment ***
{
* CBSA Division to CBSA
replace CBSA = "14460" if CBSA=="14484"
replace CBSA = "16980" if CBSA=="16974"
replace CBSA = "19100" if CBSA=="19124"
replace CBSA = "35620" if CBSA=="20764"
replace CBSA = "16980" if CBSA=="23844"
replace CBSA = "16980" if CBSA=="29404"
replace CBSA = "31100" if CBSA=="31084"
replace CBSA = "33100" if CBSA=="33124"
replace CBSA = "35620" if CBSA=="35004"
replace CBSA = "35620" if CBSA=="35084"
replace CBSA = "35620" if CBSA=="35644"
replace CBSA = "41860" if CBSA=="36084"
replace CBSA = "37980" if CBSA=="37964"
replace CBSA = "31100" if CBSA=="42044"
replace CBSA = "42660" if CBSA=="42644"
replace CBSA = "19820" if CBSA=="47644"
replace CBSA = "47900" if CBSA=="47894"

* Changes in CBSA (https://www.ffiec.gov/geocode/help1.aspx)
replace CBSA = "35840" if CBSA=="14600"
replace CBSA = "18880" if CBSA=="23020"
replace CBSA = "44600" if CBSA=="48260"


* CBA - CBSA
unique CBSA
gen cba_round = 0
replace cba_round = 1 if round_1==1
replace cba_round = 2 if round_2==1
replace CBSA = "" if round_1!=round_3
replace CBA = "" if round_1!=round_3
replace cba_round = 0 if round_1!=round_3
keep ZIP CBSA CBA cba_round state

destring ZIP, replace
bysort CBA CBSA: egen n = count(ZIP)
bysort CBA: egen temp1 = count(ZIP)
replace n = . if CBSA==""
bysort CBA: egen temp2 = max(n)
gen temp3 = temp2/temp1 if n==temp2 & CBA!=""
sum temp3, de
destring CBSA, gen(cbsa)
gen temp = cbsa if n==temp2 & CBA!=""
drop cbsa
bysort CBA: egen cbsa = mean(temp)
tostring cbsa, replace
replace CBSA = cbsa if CBA!=""
drop temp* cbsa
unique CBA CBSA if CBA!=""

* Drop the tiny part of CBSA that are not included in CBA
bysort CBSA: egen temp1 = count(n)
bysort CBSA: egen temp2 = max(n)
gen temp3 = temp2/temp1 if n==temp2
sum temp3, de
replace CBSA = "" if n!=temp2 & CBA==""
drop temp* n

unique CBA
unique CBSA
unique CBA CBSA

gen geo_id = CBSA if cba_round==0
egen temp = group(CBA)
tostring temp, replace
replace geo_id = "CBA"+temp if CBA!=""
drop temp

forvalues i = 0/2 {
	unique geo_id if cba_round==`i'
	unique CBA if cba_round==`i'
	unique CBSA if cba_round==`i'
	unique CBA CBSA if cba_round==`i'
}

duplicates tag CBSA, gen(dup1)
duplicates tag CBA CBSA, gen(dup2)
tab CBA CBSA if dup1!=dup2, m // Large MSAs subdivide into multiple CBAs
drop dup*

save "..\ZIP_CBA_CBSA.dta", replace

keep CBSA CBA cba_round
duplicates drop CBSA cba_round, force
merge 1:1 CBSA using ".\shapefile_MSA_2010\usdb"
gen urban = 1 if LSAD=="Metro"
replace urban = 0 if LSAD=="Micro"

keep CBSA urban
unique CBSA
drop if CBSA==""
merge 1:m CBSA using "..\ZIP_CBA_CBSA", nogen keep(2 3)
order ZIP geo_id CBSA CBA cba_round urban state
sort geo_id

tostring ZIP, replace
replace ZIP = "0"+ZIP if length(ZIP)==4
replace ZIP = "00"+ZIP if length(ZIP)==3
save "..\ZIP_CBA_CBSA", replace
}

* Statement FIPS - Name
{
import excel "state_name.xlsx", firstrow clear
tostring state, replace
replace state = "0"+state if length(state)==1
merge 1:m state using "..\ZIP_CBA_CBSA", nogen
save "..\ZIP_CBA_CBSA", replace
}


*** Adjacent CBSAs ***
{
use ".\shapefile_MSA_2010\uscoord", clear

* order by lat/lon and by increasing _ID for co-located points
sort _Y _X _ID

* drop polygon's start point (missing coord) and duplicates
by _Y _X _ID: keep if _n == 1 & !mi(_Y,_X)

* drop points that are not co-located and convert to a wide layout
by _Y _X: gen N = _N
drop if N == 1
sum N
local maxN = r(max)
rename _ID _ID_1
forvalues i = 2/`maxN' {
    by _Y _X: gen long _ID_`i' = _ID_1[`i']
}

* reduce to one obs per distinct co-located _ID set
keep _ID*
bysort _ID*: keep if _n == 1

* convert co-located _ID sets to a long layout
gen long set = _n
reshape long _ID_, i(set)
drop if mi(_ID_)
drop _j

* merge with the shapefile database to recover the zcta identifier
rename _ID_ id
merge m:1 id using ".\shapefile_MSA_2010\usdb", assert(match using) keep(match) keepusing(CBSA) nogen
drop id

* Merge in CBA 
preserve
use "..\ZIP_CBA_CBSA", clear
keep if cba_round!=0
keep CBSA cba_round
duplicates drop
tempfile temp
save "`temp'"
restore
merge m:1 CBSA using "`temp'", keep(1 3) nogen
bysort set: egen temp1 = min(cba_round)
bysort CBSA: egen CBA_adjacent = min(temp1)
replace CBA_adjacent = . if cba_round!=.

gen temp2 = 1 if CBSA=="33100" // Adjacent to "Miami, FL"
bysort set: egen temp3 = max(temp2)
tab CBSA if temp3==1

keep CBSA CBA_adjacent
duplicates drop

merge 1:m CBSA using "..\ZIP_CBA_CBSA", keep(2 3) nogen
save "..\ZIP_CBA_CBSA", replace
}